add toupper/tolower functions (for JuliaLang/julia#11471)

author Steven G. Johnson <stevenj@mit.edu>

Fri, 29 May 2015 17:52:48 +0000 (13:52 -0400)

committer Steven G. Johnson <stevenj@mit.edu>

Sat, 30 May 2015 02:00:30 +0000 (22:00 -0400)
author Steven G. Johnson <stevenj@mit.edu>
Fri, 29 May 2015 17:52:48 +0000 (13:52 -0400)
committer Steven G. Johnson <stevenj@mit.edu>
Sat, 30 May 2015 02:00:30 +0000 (22:00 -0400)
diff --git a/.gitignore b/.gitignore

index 0961a6bd0a9c67ebc90ccb6c4172f1e792a92a87..4c9b2dfb8f55d89f6f58d81d97c9c44b5f86ba20 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -22,4 +22,5 @@ utf8proc_data.c.new
  printproperty
  charwidth
  valid
-iterate
-\ No newline at end of file
+iterate
+case
diff --git a/Makefile b/Makefile

index e29c349151abdeddd2e12a1187e02a2234767982..45b1ed703e8431233a1582ae890a877d0295690b 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -111,10 +111,14 @@ test/valid: test/valid.c utf8proc.o utf8proc.h test/tests.h
  test/iterate: test/iterate.c utf8proc.o utf8proc.h test/tests.h
         $(cc) test/iterate.c utf8proc.o -o $@
  
-check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
+test/case: test/case.c utf8proc.o utf8proc.h test/tests.h
+       $(cc) test/case.c utf8proc.o -o $@
+
+check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o
         $(MAKE) -C bench
         test/normtest data/NormalizationTest.txt
         test/graphemetest data/GraphemeBreakTest.txt
         test/charwidth
         test/valid
         test/iterate
+       test/case
diff --git a/test/case.c b/test/case.c

new file mode 100644 (file)

index 0000000..39958e3
--- /dev/null
+++ b/test/case.c
@@ -0,0 +1,50 @@
+#include "tests.h"
+#include <wctype.h>
+
+int main(int argc, char **argv)
+{
+     int error = 0, better = 0;
+     utf8proc_int32_t c;
+
+     (void) argc; /* unused */
+     (void) argv; /* unused */
+
+     /* sanity-check tolower/toupper for every c from 0 through 0x110000 */
+     for (c = 0; c <= 0x110000; ++c) {
+          utf8proc_int32_t l = utf8proc_tolower(c);
+          utf8proc_int32_t u = utf8proc_toupper(c);
+
+          check(l == c || utf8proc_codepoint_valid(l), "invalid tolower");
+          check(u == c || utf8proc_codepoint_valid(u), "invalid toupper");
+
+          if (sizeof(wint_t) > 2 || c < (1<<16)) { /* skip c >= 0x10000 when wint_t is at most 2 bytes */
+               wint_t l0 = towlower(c), u0 = towupper(c);
+               
+               /* The OS unicode tables may be out of date, but when
+                  they do map c to something other than c, a result
+                  that differs from utf8proc's counts as an error. */
+               if (l0 != c && l0 != l) {
+                    fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n",
+                            l, c, l0);
+                    ++error;
+               }
+               else if (l0 != l) { /* OS leaves c unchanged, utf8proc maps it */
+                    ++better;
+                    /* printf("%x != towlower(%x) == %x\n", l, c, l0); */
+               }
+               if (u0 != c && u0 != u) {
+                    fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n",
+                            u, c, u0);
+                    ++error;
+               }
+               else if (u0 != u) { /* OS leaves c unchanged, utf8proc maps it */
+                    ++better;
+                    /* printf("%x != towupper(%x) == %x\n", u, c, u0); */
+               }
+          }
+     }
+     check(!error, "utf8proc case conversion FAILED %d tests.", error);
+     printf("More up-to-date than OS unicode tables for %d tests.\n", better);
+     printf("utf8proc case conversion tests SUCCEEDED.\n");
+     return 0;
+}
diff --git a/utf8proc.c b/utf8proc.c

index 971b87ad18b39a0ce1e6d449b35019038e368071..80f5ba8fd2ad154b250a7ce5638222cabd995ce0 100644 (file)
--- a/utf8proc.c
+++ b/utf8proc.c
@@ -264,6 +264,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut
                          utf8proc_get_property(c2)->boundclass);
  }
  
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c)
+{
+  utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping; /* lowercase_mapping field of c's property record */
+  return cl >= 0 ? cl : c; /* a negative stored mapping returns c itself */
+}
+
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c)
+{
+  utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping; /* uppercase_mapping field of c's property record */
+  return cu >= 0 ? cu : c; /* a negative stored mapping returns c itself */
+}
+
  /* return a character width analogous to wcwidth (except portable and
     hopefully less buggy than most system wcwidth functions). */
  UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) {
diff --git a/utf8proc.h b/utf8proc.h

index 710821524f6621694623b24eddec9d6c4e0c96f0..59f2425f57c507a710763ca92ea3d4b6629a7a8c 100644 (file)
--- a/utf8proc.h
+++ b/utf8proc.h
@@ -511,6 +511,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer,
   */
  UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2);
  
+
+/**
+ * Given a codepoint `c`, return the codepoint of the corresponding
+ * lower-case character, if any; otherwise (if there is no lower-case
+ * variant, or if `c` is not a valid codepoint) return `c`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c);
+
+/**
+ * Given a codepoint `c`, return the codepoint of the corresponding
+ * upper-case character, if any; otherwise (if there is no upper-case
+ * variant, or if `c` is not a valid codepoint) return `c`.
+ */
+UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c);
+
  /**
   * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`,
   * except that a width of 0 is returned for non-printable codepoints
author	Steven G. Johnson <stevenj@mit.edu>
	Fri, 29 May 2015 17:52:48 +0000 (13:52 -0400)
committer	Steven G. Johnson <stevenj@mit.edu>
	Sat, 30 May 2015 02:00:30 +0000 (22:00 -0400)
.gitignore		patch \| blob \| history
Makefile		patch \| blob \| history
test/case.c	[new file with mode: 0644]	patch \| blob
utf8proc.c		patch \| blob \| history
utf8proc.h		patch \| blob \| history